{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import random\n",
    "import numpy as np\n",
    "import csv\n",
    "import jieba\n",
    "\n",
    "\n",
    "file_path = './data/review.csv'\n",
    "jieba.load_userdict(\"./data/userdict.txt\")\n",
    "\n",
    "\n",
    "def load_corpus(corpus_path):\n",
    "    with open(corpus_path, 'r') as f:\n",
    "        reader = csv.reader(f)\n",
    "        rows = [row for row in reader]\n",
    "\n",
    "    # 将读取出来的语料转为list\n",
    "    review_data = np.array(rows).tolist()\n",
    "    # 打乱语料的顺序\n",
    "    random.shuffle(review_data)\n",
    "\n",
    "    review_list = []\n",
    "    sentiment_list = []\n",
    "    # 第一列为差评/好评， 第二列为评论\n",
    "    for words in review_data:\n",
    "        review_list.append(words[1])\n",
    "        sentiment_list.append(words[0])\n",
    "\n",
    "    return review_list, sentiment_list\n",
    "\n",
    "review_list, sentiment_list = load_corpus(file_path)\n",
    "\n",
    "print(review_list[:10], sentiment_list[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = len(review_list) // 5\n",
    "\n",
    "train_review_list, train_sentiment_list = review_list[n:], sentiment_list[n:]\n",
    "test_review_list, test_sentiment_list = review_list[:n], sentiment_list[:n]\n",
    "\n",
    "print('训练集数量： {}'.format(str(len(train_review_list))))\n",
    "print('测试集数量： {}'.format(str(len(test_review_list))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import jieba\n",
    "\n",
    "\n",
    "stopword_path = './data/stopwords.txt'\n",
    "\n",
    "\n",
    "def load_stopwords(file_path):\n",
    "    stop_words = []\n",
    "    with open(file_path, encoding='UTF-8') as words:\n",
    "       stop_words.extend([i.strip() for i in words.readlines()])\n",
    "    return stop_words\n",
    "\n",
    "\n",
    "def review_to_text(review):\n",
    "    stop_words = load_stopwords(stopword_path)\n",
    "    # 去除英文\n",
    "    review = re.sub(\"[^\\u4e00-\\u9fa5^a-z^A-Z]\", '', review)\n",
    "    review = jieba.cut(review)\n",
    "    # 去掉停用词\n",
    "    if stop_words:\n",
    "        all_stop_words = set(stop_words)\n",
    "        words = [w for w in review if w not in all_stop_words]\n",
    "\n",
    "    return words\n",
    "\n",
    "\n",
    "# 用于训练的评论\n",
    "review_train = [' '.join(review_to_text(review)) for review in train_review_list]\n",
    "# 对于训练评论对应的好评/差评\n",
    "sentiment_train = train_sentiment_list\n",
    "\n",
    "# 用于测试的评论\n",
    "review_test = [' '.join(review_to_text(review)) for review in test_review_list]\n",
    "# 对于测试评论对应的好评/差评\n",
    "sentiment_test = test_sentiment_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "count_vec = CountVectorizer(max_df=0.8, min_df=3)\n",
    "\n",
    "tfidf_vec = TfidfVectorizer()\n",
    "\n",
    "# 定义Pipeline对全部步骤的流式化封装和管理，可以很方便地使参数集在新数据集（比如测试集）上被重复使用。\n",
    "def MNB_Classifier():\n",
    "    return Pipeline([\n",
    "        ('count_vec', CountVectorizer()),\n",
    "        ('mnb', MultinomialNB())\n",
    "    ])\n",
    "\n",
    "mnbc_clf = MNB_Classifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mnbc_clf = MNB_Classifier()\n",
    "\n",
    "# 进行训练\n",
    "mnbc_clf.fit(review_train, sentiment_train)\n",
    "\n",
    "# 测试集准确率\n",
    "print('测试集准确率： {}'.format(mnbc_clf.score(review_test, sentiment_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
